library(car)
library(knitr)
library(kableExtra)
library(tidyverse)
library(multcompView)
library(MASS)
library(ggplot2)
library(cowplot)
##Function used to add Tukey Letters to Boxplot Figures
generate_label_df <- function(TUKEY, variable){
  
  #Extract labels and factor levels from Tukey post-hoc test
  Tukey.levels <- TUKEY[[variable]][, 4]
  Tukey.labels <- data.frame(multcompLetters(Tukey.levels)['Letters'])
  
  #Putting the labels in the same order as in the boxplot :
  Tukey.labels$treatment=rownames(Tukey.labels)
  Tukey.labels=Tukey.labels[order(Tukey.labels$treatment) , ]
  return(Tukey.labels)
}
geo_data <- read.csv("processed_data/geo.csv")
geo_data$Location <- as.factor(geo_data$Location)

##The subsetted data created as per the next section
geo_subset <- read.csv("processed_data/geovar.csv")
geo_subset$Location <- as.factor(geo_subset$Location)

##This dataset is used for the band area vs iridescence analysis
area_iri <- read.csv("processed_data/ehs-LOCATION.csv")

Subsetting the dataset

In order to run a one-way ANOVA to see if iridescence is different across the different geographic populations, we needed an equal sample size for each pop. The original dataset contained 18, 18, 17, and 19 female pipefish from the Florida Anne’s Beach, Florida Fort De Soto, Texas South Padre, and Texas Port Aransas population respectively. The smallest sample size (TX South Padre), was then chosen as the number randomly sampled from the other populations.

##Subsetting the individual populations
FLAB<-subset(geo_data,Location=="FLAB")
FLFD<-subset(geo_data,Location=="FLFD")
TXSP<-subset(geo_data,Location=="TXSP")
TXPA<-subset(geo_data,Location=="TXPA")

##Randomly selecting 17 from each population
flabw<-FLAB[sample(1:nrow(FLAB),17,replace=FALSE),]
flfdw<-FLFD[sample(1:nrow(FLFD),17,replace=FALSE),]
txspw<-TXSP[sample(1:nrow(TXSP),17,replace=FALSE),]
txpaw<-TXPA[sample(1:nrow(TXPA),17,replace=FALSE),]
set.seed(123) ##Allows for reproducability if re-run in the same session

##Combining the new subsets in a new dataset
geovar<-rbind(flabw,flfdw,txspw,txpaw)

##Writing out the dataset to ensure it is the same if further analysis is needed
write.csv(geovar,"geovar.csv")

Analysis

Summarized Data
Location depth band_num mean_hard total_easy
FLAB 7.131941 15.88235 0.2551946 12.177635
FLFD 7.141000 15.29412 0.2427085 6.636094
TXPA 5.686471 17.64706 0.1728293 7.765618
TXSP 3.993647 18.94118 0.1774011 6.760818
Note:
Summarized here are the means across location for each of the variables

Length across Location

Summary table

Summarized Data
Location length_mean length_SD
FLAB 119.68006 12.294545
FLFD 123.96988 13.118810
TXPA 105.34053 9.071285
TXSP 93.53071 8.737943

Histogram

Histogram of female body length (mm) across the four locations

Histogram of female body length (mm) across the four locations

Testing assumptions

leveneTest(length~Location, geo_subset)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  3  1.2105 0.3131
##       64
lengthaov<-aov(geo_subset$length~geo_subset$Location)
lengthresid<-resid(lengthaov)
shapiro.test(lengthresid)
## 
##  Shapiro-Wilk normality test
## 
## data:  lengthresid
## W = 0.96973, p-value = 0.09677
par(mfrow=c(2,2))
plot(lengthaov)

All assumptions met, can proceed with one-way ANOVA.

Running one-way ANOVA between Length and Location

summary(lengthaov)
##                     Df Sum Sq Mean Sq F value   Pr(>F)    
## geo_subset$Location  3   9864    3288   27.29 1.74e-11 ***
## Residuals           64   7710     120                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Post-hoc Analysis
tukey_length <- TukeyHSD(x=lengthaov, 'geo_subset$Location', conf.level = 0.95)
tukey_length
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = geo_subset$length ~ geo_subset$Location)
## 
## $`geo_subset$Location`
##                 diff        lwr        upr     p adj
## FLFD-FLAB   4.289824  -5.641033  14.220681 0.6666855
## TXPA-FLAB -14.339529 -24.270386  -4.408672 0.0017542
## TXSP-FLAB -26.149353 -36.080210 -16.218496 0.0000000
## TXPA-FLFD -18.629353 -28.560210  -8.698496 0.0000335
## TXSP-FLFD -30.439176 -40.370033 -20.508319 0.0000000
## TXSP-TXPA -11.809824 -21.740681  -1.878967 0.0134179

Significant ANOVA results. From the post-hoc analysis we can see significant differences between the lengths of females from the Florida and Texas populations.

## $`geo_subset$Location`
## FLFD FLAB TXPA TXSP 
##  "a"  "a"  "b"  "c"

Snout-vent Length across Location

Summary table

Summarized Data
Location SVL_mean SVL_SD
FLAB 48.29176 5.094617
FLFD 50.33812 5.884782
TXPA 42.53024 3.324913
TXSP 38.90365 3.762170

Histogram

Histogram of female snout-vent length (mm) across the four locations

Histogram of female snout-vent length (mm) across the four locations

Testing assumptions

leveneTest(SVL~Location, geo_subset)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  3  1.2268 0.3073
##       64
SVLaov<-aov(geo_subset$SVL~geo_subset$Location)
SVLresid<-resid(SVLaov)
shapiro.test(SVLresid)
## 
##  Shapiro-Wilk normality test
## 
## data:  SVLresid
## W = 0.98184, p-value = 0.4242
par(mfrow=c(2,2))
plot(SVLaov)

All assumptions met, can proceed with one-way ANOVA.

Running one-way ANOVA between Snout-vent Length and Location

summary(SVLaov)
##                     Df Sum Sq Mean Sq F value   Pr(>F)    
## geo_subset$Location  3   1404   468.0   21.82 7.54e-10 ***
## Residuals           64   1373    21.4                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Post-hoc Analysis
tukey_SVL <- TukeyHSD(x=SVLaov, 'geo_subset$Location', conf.level = 0.95)
tukey_SVL
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = geo_subset$SVL ~ geo_subset$Location)
## 
## $`geo_subset$Location`
##                 diff        lwr        upr     p adj
## FLFD-FLAB   2.046353  -2.143889  6.2365951 0.5737203
## TXPA-FLAB  -5.761529  -9.951772 -1.5712873 0.0031235
## TXSP-FLAB  -9.388118 -13.578360 -5.1978755 0.0000009
## TXPA-FLFD  -7.807882 -11.998124 -3.6176402 0.0000378
## TXSP-FLFD -11.434471 -15.624713 -7.2442285 0.0000000
## TXSP-TXPA  -3.626588  -7.816830  0.5636539 0.1127820

Significant ANOVA results. From the post-hoc analysis we can see significant differences between the lengths of females from the Florida and Texas populations.

## $`geo_subset$Location`
## FLFD FLAB TXPA TXSP 
##  "a"  "a"  "b"  "b"

Depth across Location

Summary table

Summarized Data
Location depth_mean depth_SD
FLAB 7.131941 1.0159922
FLFD 7.141000 1.1368301
TXPA 5.686471 0.6121274
TXSP 3.993647 0.6627263

Histogram

Histogram of female body depth(mm) across the four locations

Histogram of female body depth(mm) across the four locations

Testing assumptions

leveneTest(depth~Location, geo_subset)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  3  2.0593 0.1144
##       64
depthaov<-aov(geo_subset$depth~geo_subset$Location)
depthresid<-resid(depthaov)
shapiro.test(depthresid)
## 
##  Shapiro-Wilk normality test
## 
## data:  depthresid
## W = 0.97809, p-value = 0.2749
par(mfrow=c(2,2))
plot(depthaov)

All assumptions met, can proceed with one-way ANOVA.

Running one-way ANOVA between Depth and Location

summary(depthaov)
##                     Df Sum Sq Mean Sq F value Pr(>F)    
## geo_subset$Location  3 114.01   38.00   48.43 <2e-16 ***
## Residuals           64  50.22    0.78                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Post-hoc Analysis
tukey_depth <- TukeyHSD(x=depthaov, 'geo_subset$Location', conf.level = 0.95)
tukey_depth
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = geo_subset$depth ~ geo_subset$Location)
## 
## $`geo_subset$Location`
##                   diff        lwr        upr     p adj
## FLFD-FLAB  0.009058824 -0.7923827  0.8105003 0.9999904
## TXPA-FLAB -1.445470588 -2.2469121 -0.6440291 0.0000672
## TXSP-FLAB -3.138294118 -3.9397356 -2.3368526 0.0000000
## TXPA-FLFD -1.454529412 -2.2559709 -0.6530879 0.0000603
## TXSP-FLFD -3.147352941 -3.9487945 -2.3459114 0.0000000
## TXSP-TXPA -1.692823529 -2.4942651 -0.8913820 0.0000032

Significant ANOVA results. From the post-hoc analysis we can see significant differences between the depths of females from the Florida and Texas populations and between the two Texas populations.

## $`geo_subset$Location`
## FLFD FLAB TXPA TXSP 
##  "a"  "a"  "b"  "c"

Because of the difference in depths across the locations for all of the subsequent analyses iridescent measurements will be adjusted for by depth (iridescence/depth) resulting in Depth-adjust iridescence measurements.

Investigating relationship between Depth and Length

Across the four locations Depth and Length show the same pattern. Florida females are deeper and longer than the Texas females. Here I am testing to see what the correlation is between depth and length.

cor.test(geo_subset$length, geo_subset$depth) ##0.87
## 
##  Pearson's product-moment correlation
## 
## data:  geo_subset$length and geo_subset$depth
## t = 14.325, df = 66, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7966505 0.9179094
## sample estimates:
##       cor 
## 0.8698542
plot(geo_subset$length, geo_subset$depth, xlab = "Length (mm)", ylab = "Depth (mm)", pch = 16)
abline(lm(depth~length,data=geo_subset),lty=3, col="cyan4",lwd=3)

Due to the strong correlation between length and depth I will not be including both in the model as I am trying to avoid autocorrelation between my predictor variables. Depth will have a larger impact on the amount of iridescence present as the majority of the iridescence is present on the torso. Therefore, have a deeper torso means there is more area for iridescence to be present one. Going forward this is why a depth-adjusted measurment of iridescence is used.

Number of female bands across Location

Summary table

Summarized Data
Location band_num_mean band_num_SD
FLAB 15.88235 1.833110
FLFD 15.29412 1.212678
TXPA 17.64706 2.370158
TXSP 18.94118 2.357715

Histogram

Histogram of number of iridescent female bands across the four locations

Histogram of number of iridescent female bands across the four locations

Testing assumptions

leveneTest(band_num~Location, geo_subset)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  3  1.3565 0.2641
##       64
band_num_aov<-aov(geo_subset$band_num~geo_subset$Location)
band_numresid<-resid(band_num_aov)
shapiro.test(band_numresid)
## 
##  Shapiro-Wilk normality test
## 
## data:  band_numresid
## W = 0.98035, p-value = 0.3589
par(mfrow=c(2,2))
plot(band_num_aov)

Running one-way ANOVA between Number of Bands and Location

summary(band_num_aov)
##                     Df Sum Sq Mean Sq F value   Pr(>F)    
## geo_subset$Location  3  141.7   47.22    11.8 3.02e-06 ***
## Residuals           64  256.1    4.00                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Post-hoc Analysis
tukey_bandnum <- TukeyHSD(x=band_num_aov, 'geo_subset$Location', conf.level = 0.95)
tukey_bandnum
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = geo_subset$band_num ~ geo_subset$Location)
## 
## $`geo_subset$Location`
##                 diff         lwr      upr     p adj
## FLFD-FLAB -0.5882339 -2.39819187 1.221724 0.8266614
## TXPA-FLAB  1.7647066 -0.04525135 3.574665 0.0586368
## TXSP-FLAB  3.0588240  1.24886609 4.868782 0.0001961
## TXPA-FLFD  2.3529405  0.54298258 4.162898 0.0057270
## TXSP-FLFD  3.6470580  1.83710002 5.457016 0.0000085
## TXSP-TXPA  1.2941174 -0.51584050 3.104075 0.2442289

Significant ANOVA results. From the post-hoc analysis we can see significant differences between the number of bands that females from the Florida population have versus females from the Texas populations.

## $`geo_subset$Location`
## TXSP TXPA FLAB FLFD 
##  "a" "ab" "bc"  "c"

Creating Figure

Depth-adjusted Mean Band Iridescence across Location

Summary table

Summarized Data
Location DAI_hard_mean DAI_hard_SD
FLAB 0.0350523 0.0217445
FLFD 0.0333968 0.0161817
TXPA 0.0301637 0.0116977
TXSP 0.0441479 0.0106199

Histogram

Histogram of Depth-adjusted mean band iridescence across the four locations

Histogram of Depth-adjusted mean band iridescence across the four locations

Testing assumptions

leveneTest(DAI_hard~Location, geo_subset)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  3  0.8039 0.4963
##       64
bandiri <- aov(DAI_hard~Location, geo_subset)
bandiriresid <- resid(bandiri)
shapiro.test(bandiriresid)
## 
##  Shapiro-Wilk normality test
## 
## data:  bandiriresid
## W = 0.9236, p-value = 0.0004666
par(mfrow=c(2,2))
plot(bandiri)

All assumptions are not met, data fails normality as tested with shapiro.test. Cannot run a simple one-way ANOVA. Instead will use a GLMM with an underlying quasi-poisson distribution.

Running the model between Depth-adjusted Mean Band Iridescence and Location

bandiriPS <- glmmPQL(DAI_hard~Location,
                     random=~1|Location,
                     family=quasipoisson,
                     data=geo_subset)
## iteration 1
Anova(bandiriPS, type="III")
## Analysis of Deviance Table (Type III tests)
## 
## Response: zz
##                 Chisq Df Pr(>Chisq)    
## (Intercept) 1000.5705  1    < 2e-16 ***
## Location       7.6117  3    0.05476 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Non-significant results, mean band iridescence does not differ between populations.

Depth-adjusted Overall Iridescence across Location

Summary table

Summarized Data
Location DAIT_easy_mean DAIT_easy_SD
FLAB 1.6669564 0.6231963
FLFD 0.8983221 0.3434524
TXPA 1.3484604 0.4134546
TXSP 1.6624073 0.4279989

Histogram

Histogram of Depth-adjusted overall iridescence across the four locations

Histogram of Depth-adjusted overall iridescence across the four locations

Testing assumptions

leveneTest(DAIT_easy~Location, geo_subset)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  3  2.0445 0.1165
##       64
overalliri <- aov(geo_subset$DAIT_easy~geo_subset$Location)
overalliriresid <- resid(overalliri)
shapiro.test(overalliriresid)
## 
##  Shapiro-Wilk normality test
## 
## data:  overalliriresid
## W = 0.97826, p-value = 0.2806
par(mfrow=c(2,2))
plot(overalliri)

All assumptions are met, can proceed with a one-way ANOVA

Running the one-way ANOVA between Depth-adjusted Overall Iridescence and Location

summary(overalliri)
##                     Df Sum Sq Mean Sq F value   Pr(>F)    
## geo_subset$Location  3  6.703  2.2345   10.39 1.17e-05 ***
## Residuals           64 13.767  0.2151                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##Post-hoc analysis
tukey_overalliri <- TukeyHSD(x=overalliri, 'geo_subset$Location', conf.level = 0.95)
tukey_overalliri
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = geo_subset$DAIT_easy ~ geo_subset$Location)
## 
## $`geo_subset$Location`
##                   diff         lwr        upr     p adj
## FLFD-FLAB -0.768634342 -1.18827189 -0.3489968 0.0000513
## TXPA-FLAB -0.318496071 -0.73813362  0.1011415 0.1980000
## TXSP-FLAB -0.004549171 -0.42418672  0.4150884 0.9999915
## TXPA-FLFD  0.450138271  0.03050073  0.8697758 0.0308429
## TXSP-FLFD  0.764085171  0.34444763  1.1837227 0.0000570
## TXSP-TXPA  0.313946900 -0.10569065  0.7335844 0.2087737

Significant results from ANOVA. From the post-hoc analysis we can see signifcant differences between FL fort de soto population and the other three.

## $`geo_subset$Location`
## FLAB TXSP TXPA FLFD 
##  "a"  "a"  "a"  "b"

Creating Figure

Torso Iridescence across Location

Summary table

Summarized Data
Location body_iri_mean body_iri_SD
FLAB 7.946929 3.821406
FLFD 2.955685 2.254426
TXPA 4.630618 1.795938
TXSP 3.330288 1.579604

Histogram

Histogram of Torso iridescence across the four locations

Histogram of Torso iridescence across the four locations

Testing assumptions

leveneTest(body_iri~Location, geo_subset)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value  Pr(>F)  
## group  3  3.5081 0.02018 *
##       64                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
torsoiri_aov <- aov(body_iri~Location, geo_subset)
torsoiriresid <- resid(torsoiri_aov)
shapiro.test(torsoiriresid)
## 
##  Shapiro-Wilk normality test
## 
## data:  torsoiriresid
## W = 0.96434, p-value = 0.04887
par(mfrow=c(2,2))
plot(torsoiri_aov)

All assumptions are not met, data fails normality as tested with shapiro.test. Cannot run a simple one-way ANOVA. Instead will use a GLMM with an underlying quasi-poisson distribution.

Running the model between Torso Iridescence and Location

torsoiriPS <- glmmPQL(body_iri~Location,
                    random=~1|Location,
                    family=quasipoisson,
                    data=geo_subset)
## iteration 1
Anova(torsoiriPS, type="III")
## Analysis of Deviance Table (Type III tests)
## 
## Response: zz
##               Chisq Df Pr(>Chisq)    
## (Intercept) 493.088  1  < 2.2e-16 ***
## Location     44.638  3  1.105e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Significant results, but no corresponding post-hoc test for this model so we can’t see which locations are sig.different from each other

Creating the Figure

Investigating the Relationship between Band Area and Band Iridescence

This analysis was done for three of the four locations used above as we pulled band area measurements from (CITATION) and a Port Aransas, TX population was not used in that study.

FLABehs <- subset(area_iri, Location=="FLAB")
FLFDehs <- subset(area_iri, Location=="FLFD")
TXSPehs <- subset(area_iri, Location=="TXSP")

Investigating correlations between area and iridescence

##Looking at the correlation in the whole dataset
cor.test(area_iri$mean_sarah, area_iri$mean_hard) ##0.3685
## 
##  Pearson's product-moment correlation
## 
## data:  area_iri$mean_sarah and area_iri$mean_hard
## t = 2.831, df = 51, p-value = 0.006624
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1090932 0.5809468
## sample estimates:
##       cor 
## 0.3685204
##Looking at correlations within each Location
cor.test(FLABehs$mean_sarah, FLABehs$mean_hard) ##0.2455675
## 
##  Pearson's product-moment correlation
## 
## data:  FLABehs$mean_sarah and FLABehs$mean_hard
## t = 1.0133, df = 16, p-value = 0.326
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2499600  0.6391592
## sample estimates:
##       cor 
## 0.2455675
cor.test(FLFDehs$mean_sarah, FLFDehs$mean_hard) ##0.400215
## 
##  Pearson's product-moment correlation
## 
## data:  FLFDehs$mean_sarah and FLFDehs$mean_hard
## t = 1.7469, df = 16, p-value = 0.09983
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08197123  0.73057780
## sample estimates:
##      cor 
## 0.400215
cor.test(TXSPehs$mean_sarah, TXSPehs$mean_hard) ##0.4177789
## 
##  Pearson's product-moment correlation
## 
## data:  TXSPehs$mean_sarah and TXSPehs$mean_hard
## t = 1.7809, df = 15, p-value = 0.09518
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07866126  0.74818560
## sample estimates:
##       cor 
## 0.4177789

Overall no strong correlations seen between band area and band iridescence, with the lowest correlation in the FL Anne’s Beach population. This would probably be expected as those females possessed the most iridescence on the torso but not specifically located to the bands.

Creating Figure